
*** CLEANING CPS DATA (2010-2018) ***
*** This script processes and cleans CPS data for analysis. ***

* Reset the session; -capture- lets the script continue even if the reset errors
capture clear all
* Turn off output paging so the run never stops at a "more" prompt
set more off

*** STEP 1: Working directory ***
*** Every path below is relative, so run this script from the Project/ folder. ***

*** STEP 2: Read in the raw CPS extract ***

use "Data/InputData/CPS_Thesis.dta", clear

*** STEP 3: Restrict the sample to survey years 2010 through 2018 ***
*** Observations with a missing year are not retained. ***

keep if inrange(year, 2010, 2018)

*** STEP 4: Restrict the sample to employed individuals ***
*** Retains empstat codes 10 and 12 only (presumably the employed categories; ***
*** verify against the codebook of the extract). ***

keep if inlist(empstat, 10, 12)

*** STEP 5: Restrict the sample to the industries of interest ***
*** Industry codes retained: 5700 through 5790, 8660 and 8680. ***

keep if inrange(ind, 5700, 5790) | inlist(ind, 8660, 8680)

*** STEP 6: Discard implausible hourly wages ***
*** Retains wages of at least 3 and strictly below 999.99. Missing wages fail ***
*** the upper bound and are therefore discarded as well. ***

keep if hourwage >= 3 & hourwage < 999.99

*** STEP 7: Standardize work hours variable ***
*** Renames uhrsworkt to hours_worked and recodes its special codes to missing. ***

* Rename only when the raw variable is present in the data
capture confirm variable uhrsworkt
if _rc == 0 rename uhrsworkt hours_worked

* Convert hours_worked to numeric if necessary.
* The force option turns any non-numeric text into missing rather than aborting.
capture confirm numeric variable hours_worked
if _rc != 0 destring hours_worked, replace force

* Recode special codes to missing: 997 = hours vary, 999 = not in universe
* (999 per the IPUMS CPS codebook; confirm against the codebook of this extract).
* Left as literal numbers, these would be treated as 997 or 999 hours per week.
replace hours_worked = . if inlist(hours_worked, 997, 999)

* Missing hours are intentionally left missing. Setting them to zero would
* make workers with unknown or varying hours look like they work zero hours,
* which fabricates zero weekly earnings and a part-time classification.
* Observations with missing hours are removed later by the missing-value drop.


*** STEP 8: Ensure wage is numeric ***
*** hours_worked is already guaranteed numeric by STEP 7. ***


capture confirm numeric variable hourwage
if _rc != 0 destring hourwage, replace force


*** STEP 9: Remove outdated key variables if they exist ***
*** Ensures calculations start fresh without conflicting variables. ***

* Drop each variable separately. A single drop listing all three is
* all-or-nothing: if any one is absent, nothing is dropped, the error is
* swallowed by capture, and the gen commands below abort on the leftovers.
foreach v in log_hourwage fulltime calc_earnweek {
    capture drop `v'
}


*** STEP 10-11: Ensure no missing values in wage and work hours ***

* missing() with several arguments is true when any one of them is missing
drop if missing(hourwage, hours_worked)


*** STEP 12: Create key variables ***


* Natural log of the hourly wage; left missing when the wage is not positive
gen log_hourwage = log(hourwage) if hourwage > 0

* Full-time indicator: 1 if 35 or more hours per week, 0 otherwise.
* The missing() guard matters because Stata treats missing as larger than any
* number, so an unguarded comparison would flag missing hours as full-time.
gen fulltime = (hours_worked >= 35) if !missing(hours_worked)


*** STEP 13: Calculate weekly earnings ***
*** This is computed as hourly wage multiplied by hours worked per week. ***


gen calc_earnweek = hourwage * hours_worked


*** STEP 14: Trim extreme weekly earnings ***
*** Weekly earnings above the 99th percentile are set to missing. ***
*** The observations themselves stay in the dataset. ***


* Count the observations that have a usable weekly-earnings value
quietly count if !missing(calc_earnweek)
local n_valid = r(N)

* Nothing to trim when no observation carries a value
if `n_valid' > 0 {

    * Estimate the 99th percentile and keep it as the cutoff
    centile calc_earnweek, centile(99)
    local p99 = r(c_1)

    if "`p99'" != "" {

        * Report the cutoff in the log for later inspection
        display "99th percentile of calc_earnweek: `p99'"

        * Blank out values above the cutoff, but only when the cutoff is a
        * valid, non-missing number
        capture confirm number `p99'
        if _rc == 0 & !missing(`p99') {
            replace calc_earnweek = . if calc_earnweek > `p99'
        }
    }
}


*** STEP 15: Write the cleaned dataset to disk ***
*** The replace option overwrites any copy left by an earlier run. ***


save "Data/IntermediateData/cleaned_cps_2010_2018.dta", replace


*** STEP 16: Inspect the key variables ***
*** Prints summary statistics, then storage types and labels. ***


local keyvars hours_worked hourwage log_hourwage calc_earnweek
summarize `keyvars'
describe `keyvars'


*** STEP 17: Look for missing values and anomalies ***
*** Tabulates hours worked (missing shown as its own row), then reports ***
*** missing-value counts across the variables in the dataset. ***


tabulate hours_worked, missing
misstable summarize

